Getting the NetworkX Tree

In order to go further into the idea of the AST, we need to expand out the info we have, and need a good data structure for this. Because the AST is defined recursively and there don't seem to be great libraries for working with it, we need a better way.


In [1]:
# Necessary imports 
import os
import time
from nbminer.notebook_miner import NotebookMiner
from nbminer.cells.cells import Cell
from nbminer.features.ast_features import ASTFeatures
from nbminer.stats.summary import Summary
from nbminer.stats.multiple_summary import MultipleSummary

In [2]:
#Loading in the notebooks
people = os.listdir('../testbed/Final')
notebooks = []
for person in people:
    person = os.path.join('../testbed/Final', person)
    if os.path.isdir(person):
        direc = os.listdir(person)
        notebooks.extend([os.path.join(person, filename) for filename in direc if filename.endswith('.ipynb')])
notebook_objs = [NotebookMiner(file) for file in notebooks]
a = ASTFeatures(notebook_objs)

In [3]:
examp_nb = a.get_notebook(0)

In [4]:
print (examp_nb.get_number_cells())


23

In [5]:
new_segmentation = examp_nb.get_new_notebook()

In [6]:
print (new_segmentation.get_number_cells())


131

In [7]:
for i, nb in enumerate(a.nb_features):
    a.nb_features[i] = nb.get_new_notebook()

In [8]:
total_segments = 0
for nb in a.nb_features:
    for cell in nb.get_all_cells():
        total_segments += 1
        if len(cell.get_feature('ast').body) != 1:
            print ("Failed")

In [9]:
print (total_segments)


19882

In [10]:
all_types = []
for nb in a.nb_features:
    for cell in nb.get_all_cells():
        t = type(cell.get_feature('ast').body[0])
        all_types.append(t)

In [11]:
counting_dict = {}
for t in all_types:
    if t not in counting_dict:
        counting_dict[t] = 0
    counting_dict[t] += 1

In [14]:
import ast
                    
cells = new_segmentation.get_all_cells()
a = cells[17].get_feature('ast')
for el in a.body:
    for node in ast.iter_child_nodes(el):
        print(node)
    print (ast.dump(el))


<_ast.Attribute object at 0x110b5ed68>
<_ast.Subscript object at 0x110b5ee48>
Assign(targets=[Attribute(value=Name(id='df_eth', ctx=Load()), attr='index', ctx=Store())], value=Subscript(value=Name(id='df_eth', ctx=Load()), slice=Index(value=Str(s='id')), ctx=Load()))

In [19]:
import networkx

In [20]:
ast_tree = networkx.Graph.DirectedTree()


---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-20-bdc1088fed8c> in <module>()
----> 1 ast_tree = networkx.Graph.DirectedTree()

AttributeError: type object 'Graph' has no attribute 'DirectedTree'

In [32]:
dgraph = networkx.DiGraph()
from collections import deque
nodes = deque()
nodes.append(a.body[0])
dgraph.add_node(a.body[0])
while len(nodes) != 0:
    cur_node = nodes.pop()
    for node in ast.iter_child_nodes(cur_node):
        dgraph.add_node(node)
        dgraph.add_edge(cur_node,node)
        nodes.append(node)

In [36]:
print (dgraph.nodes())
print (dgraph.edges())


[<_ast.Assign object at 0x110b5e5c0>, <_ast.Attribute object at 0x110b5ed68>, <_ast.Subscript object at 0x110b5ee48>, <_ast.Name object at 0x110b5e2e8>, <_ast.Index object at 0x110b5e828>, <_ast.Load object at 0x102c47d68>, <_ast.Str object at 0x110b5e400>, <_ast.Name object at 0x110b5ec18>, <_ast.Store object at 0x102c47e80>]
[(<_ast.Assign object at 0x110b5e5c0>, <_ast.Attribute object at 0x110b5ed68>), (<_ast.Assign object at 0x110b5e5c0>, <_ast.Subscript object at 0x110b5ee48>), (<_ast.Attribute object at 0x110b5ed68>, <_ast.Name object at 0x110b5ec18>), (<_ast.Attribute object at 0x110b5ed68>, <_ast.Store object at 0x102c47e80>), (<_ast.Subscript object at 0x110b5ee48>, <_ast.Name object at 0x110b5e2e8>), (<_ast.Subscript object at 0x110b5ee48>, <_ast.Index object at 0x110b5e828>), (<_ast.Subscript object at 0x110b5ee48>, <_ast.Load object at 0x102c47d68>), (<_ast.Name object at 0x110b5e2e8>, <_ast.Load object at 0x102c47d68>), (<_ast.Index object at 0x110b5e828>, <_ast.Str object at 0x110b5e400>), (<_ast.Name object at 0x110b5ec18>, <_ast.Load object at 0x102c47d68>)]

In [43]:
def return_graph(node):
    dgraph = networkx.DiGraph()
    nodes = deque()
    nodes.append(node.body[0])
    dgraph.add_node(node.body[0])
    while len(nodes) != 0:
        cur_node = nodes.pop()
        for node in ast.iter_child_nodes(cur_node):
            dgraph.add_node(node)
            dgraph.add_edge(cur_node,node)
            nodes.append(node)
    return dgraph

In [59]:
graphs = [return_graph(c.get_feature('ast')) for c in cells]
roots = [c.get_feature('ast').body[0] for c in cells]

In [45]:
len(graphs)


Out[45]:
131

In [67]:
max_values = []
for n in range(len(graphs)):
    max_values.append( max(networkx.shortest_path_length(graphs[n],roots[n]).values()))

In [69]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.hist(max_values)


Out[69]:
(array([ 12.,  10.,  60.,  23.,   0.,  10.,   5.,   3.,   6.,   2.]),
 array([ 1. ,  1.8,  2.6,  3.4,  4.2,  5. ,  5.8,  6.6,  7.4,  8.2,  9. ]),
 <a list of 10 Patch objects>)

In [ ]: